{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3fc7436a-f41e-47a0-a0e6-e2e6e584464d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n",
      "  from pandas.core import (\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "data = pd.read_csv('text2cypher_questions.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f9545f22-4228-4825-8222-67d3758c7dc7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>type</th>\n",
       "      <th>database</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>What are the top 5 movies with a runtime great...</td>\n",
       "      <td>Simple Retrieval Queries</td>\n",
       "      <td>recommendations</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>List the first 3 directors born before 1950.</td>\n",
       "      <td>Simple Retrieval Queries</td>\n",
       "      <td>recommendations</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Which 5 users have rated more than 20 movies?</td>\n",
       "      <td>Simple Retrieval Queries</td>\n",
       "      <td>recommendations</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Identify the top 5 actors who have acted in mo...</td>\n",
       "      <td>Simple Retrieval Queries</td>\n",
       "      <td>recommendations</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>What are the top 3 genres associated with movi...</td>\n",
       "      <td>Simple Retrieval Queries</td>\n",
       "      <td>recommendations</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            question  \\\n",
       "0  What are the top 5 movies with a runtime great...   \n",
       "1       List the first 3 directors born before 1950.   \n",
       "2      Which 5 users have rated more than 20 movies?   \n",
       "3  Identify the top 5 actors who have acted in mo...   \n",
       "4  What are the top 3 genres associated with movi...   \n",
       "\n",
       "                       type         database  \n",
       "0  Simple Retrieval Queries  recommendations  \n",
       "1  Simple Retrieval Queries  recommendations  \n",
       "2  Simple Retrieval Queries  recommendations  \n",
       "3  Simple Retrieval Queries  recommendations  \n",
       "4  Simple Retrieval Queries  recommendations  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b6ba962a-4cf3-470a-aee9-bfb145d69af2",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data[data['database'] != 'slack']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3fc6307c-80b2-4e08-98e0-7921da8a9b72",
   "metadata": {},
   "outputs": [],
   "source": [
    "schemas_df = pd.read_csv('text2cypher_schemas.csv')\n",
    "schemas_df.head()\n",
    "schemas = {}\n",
    "for i, row in schemas_df.iterrows():\n",
    "    schemas[row['database']] = row['schema']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2519f9b6-4e69-4630-9089-95d21207646b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create OpenAI Batch processing JSONL file\n",
    "\n",
    "system = \"\"\"Given an input question, convert it to a Cypher query.\n",
    "To translate a question into a Cypher query, please follow these steps:\n",
    "\n",
    "1. Carefully analyze the provided graph schema to understand what nodes, relationships, and properties are available. Pay attention to the node labels, relationship types, and property keys.\n",
    "\n",
    "2. Identify the key entities and relationships mentioned in the natural language question. Map these to the corresponding node labels, relationship types, and properties in the graph schema.\n",
    "\n",
    "3. Think through how to construct a Cypher query to retrieve the requested information step-by-step. Focus on:\n",
    "   - Identifying the starting node(s) \n",
    "   - Traversing the necessary relationships\n",
    "   - Filtering based on property values\n",
    "   - Returning the requested information\n",
    "Feel free to use multiple MATCH, WHERE, and RETURN clauses as needed.\n",
    "\n",
    "4. Once you have finished constructing the Cypher query, provide the final query inside triple backticks ```cypher```.\n",
    "\n",
    "5. Explain how your Cypher query will retrieve the necessary information from the graph to answer the original question. Provide this explanation inside <explanation> tags.\n",
    "\n",
    "Remember, the goal is to construct a Cypher query that will retrieve the relevant information to answer the question based on the given graph schema. Carefully map the entities and relationships in the question to the nodes, relationships, and properties in the schema.\n",
    "\n",
    "If the question cannot be answered by the information in the graph schema, say \"Based on the given graph schema, there is not enough information to answer this question.\" inside ```cypher``` tags. Do not attempt to construct a query if the schema does not support it. \"\"\"\n",
    "\n",
    "\n",
    "def create_line(question, database, id):\n",
    "    schema = schemas[database]\n",
    "    user_message = f\"\"\"Based on the Neo4j graph schema below, write a Cypher query that would answer the user's question:\n",
    "    {schema}\n",
    "    \n",
    "    Question: {question}\n",
    "    Cypher query:\"\"\"\n",
    "\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": system},\n",
    "        {\"role\": \"user\", \"content\": user_message},\n",
    "    ]\n",
    "    line = {\n",
    "        \"custom_id\": \"request-\" + str(id),\n",
    "        \"method\": \"POST\",\n",
    "        \"url\": \"/v1/chat/completions\",\n",
    "        \"body\": {\n",
    "            \"model\": \"gpt-4o\",\n",
    "            \"messages\": messages,\n",
    "            \"max_tokens\": 1000,\n",
    "            \"temperature\": 0,\n",
    "        },\n",
    "    }\n",
    "    return line\n",
    "\n",
    "\n",
    "with open(\"text2cypher_batch_input.jsonl\", \"w\") as outfile:\n",
    "    for i, q in data.iterrows():\n",
    "        line = create_line(q[\"question\"], q[\"database\"], i)\n",
    "        json.dump(line, outfile)\n",
    "        outfile.write(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f05eea04-846b-40d4-a728-e285f1ff9bfd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
